In [32]:
# imports
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
# Plotting libraries
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
# For Model Evaluation
from sklearn.model_selection import cross_val_score
Load the Titanic train and test data
In [33]:
# Read the training and test CSV files into DataFrames
train, test = (
    pd.read_csv(path) for path in ('./input/train.csv', './input/test.csv')
)
In [34]:
# Preview the first 5 rows of the training set
train.head(5)
Out[34]:
In [35]:
# Preview the last 5 rows of the training set
train.tail(5)
Out[35]:
In [36]:
# Print (rows, columns) for the training set, then for the test set
for frame in (train, test):
    print(frame.shape)
In [37]:
# Summary statistics of the training set
train.describe()
Out[37]:
In [7]:
# Print a concise summary of the training set (columns, non-null counts, dtypes)
train.info()
In [8]:
# Number of missing values in each column of the training set
train.isna().sum()
Out[8]:
In [9]:
# Number of missing values in each column of the test set
test.isna().sum()
Out[9]:
In [10]:
# Count how often each Embarked value occurs in the training set;
# the most common value is used to fill the missing entries
train['Embarked'].value_counts()
Out[10]:
In [11]:
# Embarked
# Age has many missing values; Cabin is mostly missing too, so assume it
# doesn't play a big role in the predictions and drop it together with the
# other columns that are not used as features.
# PassengerId is dropped from train only; test keeps it for the submission file.
train = train.drop(['PassengerId','Ticket','Cabin', 'Name'], axis=1)
test = test.drop(['Ticket','Cabin', 'Name'], axis=1)
# fill the two missing values with the most frequent value (S)
train['Embarked'] = train['Embarked'].fillna("S")
# Set up the matplotlib figure that holds the two count plots
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(10,5))
# Draw barplot to show survivors for Embarked considering Sex.
# catplot/height replace factorplot/size, which seaborn deprecated and later
# removed; like factorplot, catplot draws on its own separate figure.
sns.catplot(x="Embarked", y="Survived", hue="Sex", data=train,
            height=6, kind="bar")
# Passengers per port, and survival outcome (0/1) broken down by port
sns.countplot(x='Embarked', data=train, ax=axis1)
sns.countplot(x='Survived', hue="Embarked", data=train, order=[0,1], ax=axis2)
Out[11]:
In [12]:
# Fare
# Fill missing fares in the test set with the test-set median
test_fare = test['Fare']
test['Fare'] = test_fare.fillna(test_fare.median())
# Store fares as int instead of float in both datasets
for frame in (train, test):
    frame['Fare'] = frame['Fare'].astype(int)
# Histogram of the training fares, x-axis limited to 0-50
train['Fare'].plot(kind='hist', figsize=(10,5),bins=100, xlim=(0,50))
Out[12]:
In [13]:
# Age
# Fill "NaN" ages with the mean age of the respective dataset.
# The result of fillna() is assigned back to the column instead of using
# chained indexing (train['Age'][mask] = ...): a chained assignment may write
# to a temporary copy and leave the NaNs in place, which would make the
# astype(int) conversion below fail.
train['Age'] = train['Age'].fillna(train['Age'].mean())
test['Age'] = test['Age'].fillna(test['Age'].mean())
# Now that there are no missing values we can
# use Age as int instead of float
train['Age'] = train['Age'].astype(int)
test['Age'] = test['Age'].astype(int)
# plot the distribution of people by age
train['Age'].plot(kind='hist', bins=50)
Out[13]:
In [14]:
# Sex
fig, (axis1,axis2) = plt.subplots(1,2,sharex=True,figsize=(10,5))
# How many people died vs survived
sns.countplot(x='Survived', data=train, order=[0,1], ax=axis1)
axis1.set_ylabel('Frequency')
# Outcome broken down by gender
sns.countplot(x='Survived', hue="Sex", data=train, order=[0,1], ax=axis2)
axis2.set_ylabel("Frequency")
# order=[0,1] puts Survived == 0 (died) first and Survived == 1 (survived)
# second, so the tick labels must follow that same order
axis1.set_xticklabels(['Died', 'Survived'], rotation=0)
axis2.set_xticklabels(['Died', 'Survived'], rotation=0)
Out[14]:
In [15]:
# Replace the categorical labels with numeric codes for the machine learning model
# Categorical columns handled here: Sex, Embarked
sex_codes = (("male", 0), ("female", 1))
embarked_codes = (("S", 0), ("C", 1), ("Q", 2))
for frame in (train, test):
    # Sex
    for label, code in sex_codes:
        frame.loc[frame["Sex"] == label, "Sex"] = code
    # Embarked
    for label, code in embarked_codes:
        frame.loc[frame["Embarked"] == label, "Embarked"] = code
train.head()
Out[15]:
In [16]:
# Separate the training frame into the target and the feature columns
y = train['Survived']  # target the model learns to predict
X = train.drop(columns=['Survived'])  # all remaining columns are the features
In [22]:
from sklearn.tree import DecisionTreeClassifier
# Fix the seed: an unseeded tree can give different scores and predictions
# on every run, which makes the notebook irreproducible
model = DecisionTreeClassifier(random_state=0)
# Fit on the full training set; this fitted model produces the test-set predictions
model.fit(X, y)
# Evaluate the accuracy of the Decision Tree Classifier with 10-fold
# cross-validation (the original, unseeded run reported about 78%)
print(cross_val_score(model, X, y, cv=10, scoring='accuracy').mean())
In [23]:
# Preview the first rows of the prepared test set
test.head()
Out[23]:
In [24]:
# Predict on the test set without PassengerId, which was also dropped from the training features
test_features = test.drop(columns=['PassengerId'])
predictions = model.predict(test_features)
Create Submission File
In [38]:
# Build the submission table: each test PassengerId with its predicted Survived label
submission = test[["PassengerId"]].assign(Survived=predictions)
# NOTE(review): the output file name has no ".csv" extension — confirm this is intended
submission.to_csv('submission_dt', index=False)